1 Setup

suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(gplots)
  library(here)
  library(hyperSpec)
  library(parallel)
  library(pander)
  library(plotly)
  library(tidyverse)
  library(tximport)
  library(vsn)
  library(zinbwave)
  library(Rtsne)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
hpal <- colorRampPalette(c("blue","white","red"))(100)
samples <- read_csv(here("doc/samples_B2.csv"))
## Rows: 24 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ScilifeID, SubmittedID, Stages, Description, ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

2 Raw data

filelist <- list.files(here("data/Salmon"), 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)

Sanity check to ensure that the data is sorted according to the sample info

filelist <- filelist[match(samples$ScilifeID,sub("_sortmerna.*","",basename(dirname(filelist))))]

stopifnot(all(match(sub("_sortmerna.*","",basename(dirname(filelist))),
                    samples$ScilifeID) == 1:nrow(samples)))

name the file list vector

names(filelist) <- samples$ID

Read the expression at the gene level

counts <- suppressMessages(round(tximport(files = filelist, 
                                  type = "salmon",
                                  txOut=TRUE)$counts))

combine technical replicates

samples$ID <- sub("_L00[1,2]", "",
                  samples$ScilifeID)
counts <- do.call(
  cbind,
  lapply(split.data.frame(t(counts),
                          samples$ID),
         colSums))

csamples <- samples[,-1]
csamples <- csamples[match(colnames(counts),csamples$ID),]

read the expression for the pool of lincRNAs we found

linc_read <- read_delim("~/Git/lncRNAs/doc/time_expression_nc_filtered.tsv",
                        delim = " ")
## Rows: 169731 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr  (2): Transcript.ID, peak
## dbl (11): score, S1, S2, S3, S4, S5, S6, S7, S8, maxn, n
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
load(here("doc/extra_filtering.rda"))
linc_read <- linc_read[! linc_read$Transcript.ID %in% removing,]

linc <- linc_read$Transcript.ID

counts <- counts[linc, ]

2.1 Quality Control

  • Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "12.9% percent (21769) of 168333 genes are not expressed"
  • Let us take a look at the sequencing depth, colouring by Stages
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>% 
  bind_cols(csamples)

ggplot(dat,aes(x,y,fill=samples$Stages)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())

  • Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected, considering we have lincRNAs, caracterised by a really low signal.

ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 21769 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by Stages.

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Stages=csamples$Stages[match(ind,csamples$ID)])

ggplot(dat,aes(x=values,group=ind,col=Stages)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 3505842 rows containing non-finite values (stat_density).

2.2 Export

dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data_linc_new.csv"))

3 Data normalisation

3.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.

csamples$Stages <- factor(csamples$Stages)

there are a lot of zeros, so we use zinbwave

se <- SummarizedExperiment(assays=list(counts=as.matrix(counts)),
                           colData=as.data.frame(csamples))

Remove non expressed

se <- se[rowSums(assay(se))>0,]

zinb <- zinbwave(se,K=0,epsilon=1e12,
                 X="~Stages",
                 observationalWeights=TRUE)


save(zinb,file=here("data/analysis/salmon/zinb_new.rda"))

Check the size factors (i.e. the sequencing library size effect)

dds <- DESeqDataSet(zinb,design=~Stages)
## converting counts to integer mode
dds <- DESeq(dds, 
             sfType = "poscounts", 
             useT = TRUE, 
             minmu = 1e-6)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## Warning in getAndCheckWeights(object, modelMatrix, weightThreshold = weightThreshold): for 10317 row(s), the weights as supplied won't allow parameter estimation, producing a
##   degenerate design matrix. These rows have been flagged in mcols(dds)$weightsFail
##   and treated as if the row contained all zeros (mcols(dds)$allZero set to TRUE).
##   If you are blocking for donors/organisms, consider design = ~0+donor+condition.
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
save(dds,file=here("data/analysis/salmon/dds_linc_new.rda"))

3.2 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
save(vst,file=here("data/analysis/DE/vst-blind_linc_new.rda"))

3.3 Variance Stabilising Transformation

vsda <- varianceStabilizingTransformation(dds, blind=FALSE)
vsta <- assay(vsda)
vsta <- vsta - min(vsta)
save(vsta,file=here("data/analysis/DE/vst-aware_linc_new.rda"))

# prepare the data to build the network
#ID <- rownames(vsta)
#vsta <- cbind(ID,vsta)
#vsta_tibble <- as_tibble(vsta)
#write_tsv(vsta_tibble,path=here("data/analysis/DE/vst-aware_linc.tsv"))
#thing <- read_tsv(here("data/analysis/DE/vst-aware_linc.tsv"))
#library(Biostrings)
#seq <- readDNAStringSet("data/trinity/Trinity.fasta")
#names(seq) <- sub(" .*","",names(seq))
#IDs <- thing$ID
#writeXStringSet(seq[IDs],file="~/Git/lncRNAs/data/analysis/DE/linc_all.fasta")
  • Validation Check the variance stabilisation. It could be worse, considering we have a pool of lincRNAs.
meanSdPlot(log2(counts(dds)[!mcols(dds)$allZero,]+1))

meanSdPlot(log2(assay(zinb)+1))

meanSdPlot(vst[rowSums(vst)>0,])

meanSdPlot(vsta[rowSums(vsta)>0,]) 

3.4 QC on the normalised data

3.4.1 PCA

#load("data/analysis/DE/vst-aware_linc.rda")
#load("data/analysis/salmon/dds_linc.rda")
pc <- prcomp(t(vsta))
percent <- round(summary(pc)$importance[2,]*100)
  • Cumulative components effect

We define the number of variable of the model

nvar=1

And the number of possible combinations

nlevel=nlevels(dds$Stages)

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
## Warning: Removed 2 row(s) containing missing values (geom_path).

3.4.2 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    csamples)

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Stages,text=dds$ID)) + 
  geom_point(size=2) + 
  theme_classic() +
  ggtitle("Principal Component Analysis lincRNAs") +
          #,subtitle="variance stabilized counts"
  labs(x=paste("PC1 (",percent[1],"%)",sep=""),
       y=paste("PC2 (",percent[2],"%)",sep="")) +
  theme(text=element_text(size=12)) +
  #theme(plot.title=element_text(size=10)) +
  theme(plot.title = element_text(face = "bold"))
p  

plot(p + labs(x=paste("PC1 (",percent[1],"%)",sep=""),
              y=paste("PC2 (",percent[2],"%)",sep="")))

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

3.4.3 Heatmap

Filter for noise

conds <- factor(csamples$Stages)
sels <- rangeFeatureSelect(counts=vsta,
                           conditions=conds,
                           nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 2 y values <= 0 omitted from
## logarithmic plot

vst.cutoff <- 1
  • Heatmap of “all” genes
mar <- par("mar")

par(mar=c(0.05,0.05,0.05,0.05)) 
hm <- heatmap.2(t(scale(t(vsta[sels[[vst.cutoff+1]],]))),
                distfun=pearson.dist,
                hclustfun=function(X){hclust(X,method="ward.D2")},
                labRow = NA,trace = "none",
                labCol = conds,
                col=hpal)

plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=conds)

3.5 Conclusion

# The Biological QA is good, considering it's based on lincRNAs. We have no outliers. 
# The sequencing depth decreased comparing to the previous analysis.
# Looking at the PCA, it could be interesting to do DE analysis to see in particular what's going on
# in S3 and S6. I consider those two stages relevant, because I think things are changes here. 
# 

4 Session Info

## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] parallel  grid      stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] Rtsne_0.16                  zinbwave_1.18.0            
##  [3] SingleCellExperiment_1.18.0 vsn_3.64.0                 
##  [5] tximport_1.24.0             forcats_0.5.2              
##  [7] stringr_1.4.1               dplyr_1.0.10               
##  [9] purrr_0.3.4                 readr_2.1.2                
## [11] tidyr_1.2.1                 tibble_3.1.8               
## [13] tidyverse_1.3.2             plotly_4.10.0              
## [15] pander_0.6.5                hyperSpec_0.100.0          
## [17] xml2_1.3.3                  ggplot2_3.3.6              
## [19] lattice_0.20-45             here_1.0.1                 
## [21] gplots_3.1.3                DESeq2_1.36.0              
## [23] SummarizedExperiment_1.26.1 Biobase_2.56.0             
## [25] MatrixGenerics_1.8.1        matrixStats_0.62.0         
## [27] GenomicRanges_1.48.0        GenomeInfoDb_1.32.4        
## [29] IRanges_2.30.1              S4Vectors_0.34.0           
## [31] BiocGenerics_0.42.0         data.table_1.14.2          
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.1           backports_1.4.1        lazyeval_0.2.2        
##   [4] splines_4.2.0          crosstalk_1.2.0        BiocParallel_1.30.3   
##   [7] digest_0.6.29          htmltools_0.5.3        fansi_1.0.3           
##  [10] magrittr_2.0.3         memoise_2.0.1          googlesheets4_1.0.1   
##  [13] limma_3.52.2           tzdb_0.3.0             Biostrings_2.64.1     
##  [16] annotate_1.74.0        modelr_0.1.9           vroom_1.5.7           
##  [19] jpeg_0.1-9             colorspace_2.0-3       blob_1.2.3            
##  [22] rvest_1.0.3            haven_2.5.1            xfun_0.33             
##  [25] hexbin_1.28.2          crayon_1.5.1           RCurl_1.98-1.8        
##  [28] jsonlite_1.8.0         genefilter_1.78.0      survival_3.3-1        
##  [31] glue_1.6.2             gtable_0.3.1           gargle_1.2.1          
##  [34] zlibbioc_1.42.0        XVector_0.36.0         DelayedArray_0.22.0   
##  [37] scales_1.2.1           edgeR_3.38.4           DBI_1.1.3             
##  [40] Rcpp_1.0.9             viridisLite_0.4.1      xtable_1.8-4          
##  [43] bit_4.0.4              preprocessCore_1.58.0  htmlwidgets_1.5.4     
##  [46] httr_1.4.4             RColorBrewer_1.1-3     ellipsis_0.3.2        
##  [49] farver_2.1.1           pkgconfig_2.0.3        XML_3.99-0.10         
##  [52] sass_0.4.2             dbplyr_2.2.1           deldir_1.0-6          
##  [55] locfit_1.5-9.6         utf8_1.2.2             labeling_0.4.2        
##  [58] softImpute_1.4-1       tidyselect_1.1.2       rlang_1.0.5           
##  [61] AnnotationDbi_1.58.0   munsell_0.5.0          cellranger_1.1.0      
##  [64] tools_4.2.0            cachem_1.0.6           cli_3.4.0             
##  [67] generics_0.1.3         RSQLite_2.2.17         broom_1.0.1           
##  [70] evaluate_0.16          fastmap_1.1.0          yaml_2.3.5            
##  [73] knitr_1.40             bit64_4.0.5            fs_1.5.2              
##  [76] caTools_1.18.2         KEGGREST_1.36.3        brio_1.1.3            
##  [79] compiler_4.2.0         rstudioapi_0.14        png_0.1-7             
##  [82] testthat_3.1.4         affyio_1.66.0          reprex_2.0.2          
##  [85] geneplotter_1.74.0     bslib_0.4.0            stringi_1.7.8         
##  [88] highr_0.9              Matrix_1.4-1           vctrs_0.4.1           
##  [91] pillar_1.8.1           lifecycle_1.0.2        BiocManager_1.30.18   
##  [94] jquerylib_0.1.4        bitops_1.0-7           R6_2.5.1              
##  [97] latticeExtra_0.6-30    affy_1.74.0            KernSmooth_2.23-20    
## [100] codetools_0.2-18       gtools_3.9.3           assertthat_0.2.1      
## [103] rprojroot_2.0.3        withr_2.5.0            GenomeInfoDbData_1.2.8
## [106] hms_1.1.2              rmarkdown_2.16         googledrive_2.0.0     
## [109] lubridate_1.8.0        interp_1.1-3